1 Project Overview

1.1 Project objective

  • Tryout PostgreSQL
  • Learn how to write basic SQL queries to get required data from DB
  • Perform data analysis with R (dplyr, plotly) with the DVD

1.2 Used tools

  • PostgreSQL
  • R (DBI, dplyr, plotly)

2 SQL Query for Getting Data from DB

pgdrv <- dbDriver("PostgreSQL")

con <- dbConnect(pgdrv, dbname="dvd", 
                 port="5432", 
                 user="postgres", 
                 password=pd, 
                 host="localhost")

top_rented <- dbGetQuery(con, "SELECT c.name, COUNT(c.name) AS count_genre, SUM(p.amount) AS total_sale FROM payment p
INNER JOIN rental r ON p.rental_id = r.rental_id
INNER JOIN inventory i ON i.inventory_id = r.inventory_id
INNER JOIN film_category fc ON fc.film_id = i.film_id
INNER JOIN category c ON c.category_id = fc.category_id
GROUP BY c.name
ORDER BY count_genre DESC
LIMIT 1")

least_rented <- dbGetQuery(con, "SELECT c.name, COUNT(c.name) AS count_genre, SUM(p.amount) AS total_sale FROM payment p
INNER JOIN rental r ON p.rental_id = r.rental_id
INNER JOIN inventory i ON i.inventory_id = r.inventory_id
INNER JOIN film_category fc ON fc.film_id = i.film_id
INNER JOIN category c ON c.category_id = fc.category_id
GROUP BY c.name
ORDER BY count_genre ASC
LIMIT 1")

du_genre <- dbGetQuery(con, "SELECT x.name, COUNT(x.name) as num_user FROM (
    SELECT DISTINCT c.name, r.customer_id FROM category c
    INNER JOIN film_category fc ON c.category_id = fc.category_id
    INNER JOIN inventory i ON i.film_id = fc.film_id
    INNER JOIN rental r ON r.inventory_id = i.inventory_id) x 
GROUP BY x.name
ORDER BY x.name")

ave_ren_rate <- dbGetQuery(con, "SELECT c.name, ROUND(AVG(f.rental_rate),2) AS average_rental_rate FROM category c 
INNER JOIN film_category fc ON c.category_id = fc.category_id 
INNER JOIN film f ON f.film_id = fc.film_id 
GROUP BY c.name 
ORDER BY average_rental_rate DESC;")

re_time <- dbGetQuery(con, "SELECT x1.result, COUNT(x1.result) FROM (
    SELECT (
        CASE WHEN res.rental_duration > res.bw THEN 'early'
        WHEN res.rental_duration = res.bw THEN 'on time'
        WHEN res.rental_duration < res.bw THEN 'late'
        ELSE 'NA'
        END) AS result
    FROM (SELECT x.* 
        FROM (
            SELECT f.film_id, f.rental_duration, DATE_PART('day',r.return_date - r.rental_date) AS bw 
            FROM rental r 
            INNER JOIN inventory i ON i.inventory_id = r.inventory_id 
            INNER JOIN film f ON f.film_id = i.film_id
        ) x
    ) res
) x1
GROUP BY x1.result;")

have_presence <- dbGetQuery(con, "SELECT c.country, COUNT(DISTINCT cus.customer_id) AS customer_base, SUM(p.amount) AS total_sale FROM customer cus 
INNER JOIN address a ON cus.address_id = a.address_id
INNER JOIN city ct ON a.city_id = ct.city_id
INNER JOIN country c ON ct.country_id = c.country_id
INNER JOIN payment p ON p.customer_id = cus.customer_id
GROUP BY c.country
ORDER BY total_sale DESC;")

# get DB as a whole for in-depth analysis
category <- dbGetQuery(con, "SELECT category_id, name FROM category")
film_category <- dbGetQuery(con, "SELECT film_id, category_id FROM film_category")
inventory <- dbGetQuery(con, "SELECT film_id, inventory_id FROM inventory")
rental <- dbGetQuery(con, "SELECT rental_id, inventory_id FROM rental")
payment <- dbGetQuery(con, "SELECT payment_id, payment_date, rental_id, amount FROM payment")
film <- dbGetQuery(con, "SELECT film_id, title, rental_duration FROM film")

# get sales and rent of each genre (desc)
genre_rented <- dbGetQuery(con, "SELECT c.name, COUNT(c.name) AS count_genre, SUM(p.amount) AS total_sale FROM payment p
INNER JOIN rental r ON p.rental_id = r.rental_id
INNER JOIN inventory i ON i.inventory_id = r.inventory_id
INNER JOIN film_category fc ON fc.film_id = i.film_id
INNER JOIN category c ON c.category_id = fc.category_id
GROUP BY c.name
ORDER BY count_genre DESC")

3 Basic Analysis

3.1 Most and least lented genre

top_rented
##     name count_genre total_sale
## 1 Sports        1081    4892.19
least_rented
##    name count_genre total_sale
## 1 Music         750    3071.52

3.2 Number of distinct users that have rented each genre

du_genre
##           name num_user
## 1       Action      510
## 2    Animation      500
## 3     Children      482
## 4     Classics      468
## 5       Comedy      495
## 6  Documentary      483
## 7        Drama      501
## 8       Family      501
## 9      Foreign      493
## 10       Games      474
## 11      Horror      451
## 12       Music      447
## 13         New      468
## 14      Sci-Fi      507
## 15      Sports      519
## 16      Travel      442

3.3 Average rental rate for each genre

ave_ren_rate
##           name average_rental_rate
## 1        Games                3.25
## 2       Travel                3.24
## 3       Sci-Fi                3.22
## 4       Comedy                3.16
## 5       Sports                3.13
## 6          New                3.12
## 7      Foreign                3.10
## 8       Horror                3.03
## 9        Drama                3.02
## 10       Music                2.95
## 11    Children                2.89
## 12   Animation                2.81
## 13      Family                2.76
## 14    Classics                2.74
## 15 Documentary                2.67
## 16      Action                2.65

3.4 Number of films returned late, early, and on time

re_time
##    result count
## 1      NA   183
## 2    late  6403
## 3 on time  1720
## 4   early  7738

E. Countries that Rent A Film have a presence with their sales and customer base

have_presence
##                                   country customer_base total_sale
## 1                                   India            60    6034.78
## 2                                   China            53    5251.03
## 3                           United States            36    3685.31
## 4                                   Japan            31    3122.51
## 5                                  Mexico            30    2984.82
## 6                                  Brazil            28    2919.19
## 7                      Russian Federation            28    2765.62
## 8                             Philippines            20    2219.70
## 9                                  Turkey            15    1498.49
## 10                              Indonesia            14    1352.69
## 11                                Nigeria            13    1314.92
## 12                              Argentina            13    1298.80
## 13                                 Taiwan            10    1155.10
## 14                           South Africa            11    1069.46
## 15                                   Iran             8     877.96
## 16                         United Kingdom             9     850.96
## 17                                 Poland             8     786.16
## 18                                  Italy             7     753.26
## 19                                Germany             7     741.24
## 20                                Vietnam             6     676.45
## 21                                Ukraine             6     675.53
## 22                               Colombia             6     661.54
## 23                                  Egypt             6     659.48
## 24                              Venezuela             7     632.43
## 25                                 Canada             5     559.70
## 26                            Netherlands             5     557.73
## 27                            South Korea             5     527.77
## 28                                  Spain             5     513.80
## 29                                  Yemen             4     473.93
## 30                               Pakistan             5     473.84
## 31                           Saudi Arabia             5     452.94
## 32                                   Peru             4     407.01
## 33                               Thailand             3     401.08
## 34                                 Israel             4     379.13
## 35                                Ecuador             3     369.18
## 36                             Bangladesh             3     353.19
## 37                                Algeria             3     349.18
## 38                                 France             4     334.12
## 39                               Malaysia             3     330.23
## 40                               Tanzania             3     322.22
## 41                             Mozambique             3     315.25
## 42                   United Arab Emirates             3     305.25
## 43                     Dominican Republic             3     304.26
## 44                                  Chile             3     303.34
## 45                                Austria             3     284.30
## 46                                Morocco             3     274.35
## 47                               Paraguay             3     273.40
## 48                                Belarus             2     271.36
## 49                                 Latvia             2     249.43
## 50                            Switzerland             3     248.41
## 51                                  Kenya             2     245.49
## 52                             Yugoslavia             2     233.49
## 53                            Puerto Rico             2     224.48
## 54                                Romania             2     218.42
## 55                                 Runion             1     211.55
## 56                       French Polynesia             2     205.52
## 57                                 Greece             2     204.54
## 58                                  Sudan             2     202.51
## 59                             Azerbaijan             2     198.53
## 60                               Bulgaria             2     194.52
## 61                              Kazakstan             2     192.51
## 62                                 Angola             2     187.55
## 63                               Cameroon             2     186.49
## 64                                Myanmar             2     179.53
## 65                               Cambodia             2     179.51
## 66                                Bolivia             2     178.56
## 67  Congo, The Democratic Republic of the             2     168.58
## 68                                   Oman             2     161.56
## 69          Holy See (Vatican City State)             1     146.68
## 70                                  Nauru             1     143.70
## 71                                 Sweden             1     139.67
## 72                         Czech Republic             1     132.72
## 73                                Moldova             1     127.66
## 74                           Turkmenistan             1     126.74
## 75                                   Chad             1     122.72
## 76                                 Malawi             1     121.73
## 77                                 Zambia             1     121.70
## 78                   Virgin Islands, U.S.             1     121.69
## 79                              Greenland             1     119.72
## 80                                Armenia             1     118.75
## 81                                 Gambia             1     114.73
## 82                                   Iraq             1     111.73
## 83                                Hungary             1     111.71
## 84                                Bahrain             1     108.76
## 85                            North Korea             1     107.71
## 86                                 Brunei             1     107.66
## 87                                 Kuwait             1     106.75
## 88                                Estonia             1     105.72
## 89                              Hong Kong             1     104.76
## 90                              Sri Lanka             1     103.73
## 91                          Liechtenstein             1      99.74
## 92                               Anguilla             1      99.68
## 93                          French Guiana             1      97.80
## 94                          Faroe Islands             1      96.76
## 95                                Senegal             1      95.76
## 96                                  Nepal             1      93.83
## 97                                 Tuvalu             1      93.78
## 98                             Madagascar             1      92.79
## 99                               Ethiopia             1      91.77
## 100                           New Zealand             1      85.77
## 101                              Slovakia             1      80.77
## 102                               Finland             1      78.79
## 103                               Tunisia             1      73.78
## 104                           Afghanistan             1      67.82
## 105                                 Tonga             1      64.84
## 106      Saint Vincent and the Grenadines             1      64.82
## 107                             Lithuania             1      63.78
## 108                        American Samoa             1      47.85

4 In-Depth Analysis

4.1 Real popularity of genres

# get required variables
nt = genre_rented %>% select(name, count_genre)

# draw plot in descending order
nt %>% 
  plot_ly(x = ~name, y = ~count_genre) %>%
  add_bars() %>%
  layout(title ="Popularity of Genres (without Adjustment)",
    autosize = T,
    yaxis = list(title = '# of rents', range = c(600, 1200), showline = TRUE),
    xaxis = list(type = 'category', title = 'Genre',  categoryorder = "array", categoryarray = ~c("Sports", "Animation", "Action", "Sci-Fi",  "Family", "Foreign", "Drama", "Documentary", "Games", "New", "Children", "Classics",  "Comedy", "Horror", "Travel", "Music") ))

Question: Sports seems to be the most popular and Music seems to be the least. Is this really so?

# connect category to film_id
cf = merge(x = category, y = film_category, by = "category_id", y.all = TRUE)

# number of disticnt films per category
ci_s = cf %>% group_by(category_id) %>%
  summarise(sum = sum(film_id/film_id))

# get variables needed
dfg = merge(x = category, y = ci_s) %>% select(name, sum)

# draw plot in descending order
dfg %>% 
  plot_ly(x = ~name, y = ~sum) %>%
  add_bars() %>%
  layout(title ="Distinct Titles per Genre",
    autosize = T,
    yaxis = list(title = '# of Distinct Movies', range = c(45, 80), showline = TRUE),
    xaxis = list(type = 'category', title = 'Genre',  categoryorder = "array", categoryarray = ~c("Sports", "Animation", "Action", "Sci-Fi",  "Family", "Foreign", "Drama", "Documentary", "Games", "New", "Children", "Classics",  "Comedy", "Horror", "Travel", "Music")))

Insight: Sports has the most many number of distinct titles and music has the least. Given this, Sport and Music might not be the most and least poular genres respectively.

# create adjusted rent count
rrc = merge(x=nt, y=dfg) %>% transmute(name, arc =  count_genre / sum)

rrc %>% 
  plot_ly(x = ~name, y = ~arc, color = 'red') %>%
  add_bars() %>%
  layout(title ="Popularity of Genres (with Adjustment)",
    autosize = T,
    yaxis = list(title = '# of Rents', range = c(10, 18), showline = TRUE),
    xaxis = list(type = 'category', title = 'Genre',  categoryorder = "array", categoryarray = ~c("Sports", "Animation", "Action", "Sci-Fi",  "Family", "Foreign", "Drama", "Documentary", "Games", "New", "Children", "Classics",  "Comedy", "Horror", "Travel", "Music") ))

Insight: the plot above represents # of rents per genre divided by distinct titles per genre. It is observable that Sports is not as popular given its large number of distinct titles. Music is actually more popular than Sports in this respect.

4.2 ROI (Return on Investment) of genres

# inventory count per film
icf = inventory %>% count(film_id)

# select need var
nn = merge(icf, cf) %>% select(name, n) 

# investment by genre
ig = nn %>% group_by(name) %>%
  summarise(sum = sum(n))

# get sales per genre 
gs = genre_rented %>% select(name, total_sale)

git = merge(ig, gs)

# regression
ts_lm  <- lm(total_sale ~ sum, data = git)

git %>%
  plot_ly(x = ~sum, y = ~total_sale, color = ~name, hoverinfo = "text",
          text = ~paste0("<b>", name, "</b> <br>",
                         "Sales: ", total_sale, "<br>",
                         "Inventory: ", sum)) %>%
  
  #add_lines(y = ~fitted(ts_lm), name = "Regression Line") %>%
  layout(title = 'Relationship between Investment and Sales (by genre)',
      xaxis = list(title = 'Inventory', zeroline = TRUE),
      yaxis = list(title = 'Sales per Genre'))

Insight: this plot shows whether each genre is generating enough sales proportionate to its inventory. The genre Childern and Classics are somewhat lagging behind, and Comedy is showing a good performance. However, there is no outlier probably because each genre contains many films.

4.3 ROI (Return on Investment) of each film

# get only the needed variables
ra = payment %>% select(rental_id, amount)
ri = rental %>% select(rental_id, inventory_id)
fi = inventory %>% select(inventory_id, film_id)
ftr = film %>% select(film_id, title, rental_duration)

# sum sales by rental_id
rs = ra %>% group_by(rental_id) %>%
  summarise(sum = sum(amount))

# get sales per inventory_id
spi = merge(x=rs,y=ri,by="rental_id", all.y=TRUE)

# na to 0
spi[is.na(spi)] = 0

# film_id and sum
fs = merge(x=spi, y=fi, by = 'inventory_id') %>% select(sum, film_id) 

# inventory per film
ipf = fs %>% count(film_id)

# sales per film
spf = fs %>% group_by(film_id) %>% summarise(sales_per_film = sum(sum))

# sales and inventory of films
fsi = merge(spf, ipf, by='film_id')

tsnr = merge(fsi, ftr, by='film_id', all.y = TRUE) %>% select(title, sales_per_film, n, rental_duration)

tsnr = tsnr[!is.na(tsnr$n),]

tsnr = mutate(tsnr, ratio = sales_per_film / n)

bin = cut(tsnr$ratio, 3, labels = c('High', 'Mid', 'Low'), include.lowest = FALSE, right = TRUE, ordered_result = FALSE)

tsnr = cbind(tsnr, bin)

tsnr %>%
  plot_ly(x = ~n, y = ~sales_per_film, color = bin, hoverinfo = "text",
          text = ~paste0("<b>", title, "</b> <br>",
                         "Sales: ", sales_per_film, "<br>",
                         "Inventory: ", n, "<br>",
                         "Sales/Inventory: ", round(ratio, digit=2))) %>%
  
  #add_lines(y = ~fitted(ts_lm), name = "Regression Line") %>%
  layout(title = 'Relationship between Investment and Sales (by film)',
      xaxis = list(title = 'Inventory', zeroline = TRUE),
      yaxis = list(title = 'Sales per Film'))

Insight: this plot shows whether each film is generating enough sales with respect to its inventory. The three groups equally divides the lists of films according to their ratio of sales to invesment (inventory). By examining this table, we could know the what films need more inventory and what films the owner should not purchase more.

4.4 Sales Trend

# select only the relevant variables
payment = payment %>% select(amount, payment_date)

payment$payment_date = format(as.Date(payment$payment_date), "%Y-%m")

# sales per month
spm = payment %>% group_by(payment_date) %>% summarise(sales_per_month = sum(amount))

plot_ly(opacity = 1) %>% 
    add_lines(x = ~spm$payment_date, y = ~spm$sales_per_month, name = "Sales Trend") %>%
    layout(xaxis = list(title = 'Sales Amount'), yaxis = list(title = 'Time'))

Insight: although the time horizon for this dataset is not long enough to draw quality insight, trend analysis is a case worth practicing.

5 Reference

5.1 Source of the Data